/*cd "E:\My Drive\Wage_Rigidity\Surveys"
global graphs "./graphs"
global log "./log"
global data "./regulation gov"

set scheme plotplain
graph set window fontface "Times New Roman"

capture log close
log using "$log/1_clean_comments", replace*/

* Load the raw comment export (sheet all_comments). The first row supplies the
* variable names, which are forced to lower case.
* NOTE(review): the data global is only set inside the commented-out header
* above, so it presumably has to be defined by the caller -- confirm.
import excel using "$data/all_comments.xlsx", sheet("all_comments") firstrow case(lower) clear

* Remove the proposal itself and the supporting material (transcripts, per the
* original author's note) so that only the other document types remain.
drop if inlist(documenttype, "Proposed Rule", "Supporting & Related Material")

* Campaign form letter: flag every comment that contains one of the known
* variants of its opening sentence. spam is 1 when a variant is found and 0
* otherwise.
gen spam = 0
quietly replace spam = 1 if strpos(comment, "in 1975, President Gerald Fords")
quietly replace spam = 1 if strpos(comment,"In 1975, President Gerald Fords")
quietly replace spam = 1 if strpos(comment,"In 1975, President Gerald Ford's")
quietly replace spam = 1 if strpos(comment,"in 1975, President Gerald Ford's")
quietly replace spam = 1 if strpos(comment,"In 1975 President Gerald Fords")
quietly replace spam = 1 if strpos(comment,"in 1975, President Fords Labor Dept")
tabulate spam
* Flagged comments that carry an attachment are always kept.
drop if attachmentfiles=="" & spam>0

* Second form letter, recognised by its opening sentence. spam2 stores the
* position of the match inside the comment (0 when the sentence is absent).
local opener "Right now, too many people in our country are overworked and underpaid"
generate spam2 = strpos(comment,"`opener'")
tabulate spam2
* As with the first campaign, anything with an attachment is kept.
drop if attachmentfiles=="" & spam2>0

*More more spam
* Third form letter; spam3 stores the position of the match (0 when absent).
* The dollar sign is spliced in with char(36). Written directly inside a
* double-quoted literal, a dollar sign followed by digits is read as a global
* macro reference and expanded (to nothing) before strpos() runs, so the search
* text silently loses it and the filter never matches.
* NOTE(review): the repeated word at in the needle is kept as written --
* presumably it mirrors the campaign text; confirm.
gen spam3=strpos(comment,"I urge the Department of Labor to implement the new overtime rule, keep the salary threshold at at least " + char(36) + "47,476")
tab spam3
* Comments with attachments are kept even when flagged.
drop if spam3>0 & attachmentfiles==""

*More more more spam
* Fourth form letter; same char(36) splice for the dollar amount.
gen spam4=strpos(comment,"I urge the Department of Labor to protect fair pay for working women and their families by defending and implementing the updated overtime rule issued in 2016 and keeping the salary threshold at no less than " + char(36) + "47,476, with automatic increases at least every three years.")
tab spam4
drop if spam4>0 & attachmentfiles==""

*Exact duplicates: tag counts how many other rows share the same comment text (0 = unique)
	duplicates tag comment, generate(tag)
	tabulate tag
	* NOTE(review): every copy of a duplicated comment is removed, the first
	* occurrence included, unless it has an attachment -- confirm intended.
	drop if attachmentfiles=="" & tag>0

*Short comments are mostly uninformative: drop those whose text length is below 1000, unless they have an attachment
generate length = strlen(comment)
tabulate length
drop if attachmentfiles=="" & length<1000

	*Comments sharing the exact same length are often the same form letter with a different signature.
	*count is the number of rows with that length; groups of 30 or more are dropped (attachments kept).
	bysort length: gegen count = count(length)
	drop if attachmentfiles=="" & count>=30
	*Inspect what is left of the largest same-length group
	summarize count, detail
	browse if count==`r(max)'

*Write the filtered comments to disk, as a Stata dataset and as an Excel workbook with variable names in the first row
save "$data/dropped_spam.dta", replace
export excel using "$data/dropped_spam.xlsx", firstrow(variables) replace

*************************************************************
*Search the filtered comments for substantive responses
*************************************************************
use "$data/dropped_spam.dta", clear
*Length alone does not make a comment useful, so flag the ones quoting the DOL's question.
*relevant is 1 when either phrase appears in the comment and 0 otherwise.
generate relevant = (strpos(comment, "To what extent did employers") > 0) | (strpos(comment,"in anticipation of the 2016 Final Rule's") > 0)
tabulate relevant
browse if relevant>0 //Author's note: these 2 are good! Length is min 1800

*Comments mentioning hours monitoring; monitor is the position of the first match (0 when absent, case-sensitive)
generate monitor = strpos(comment, "monitor")
tabulate monitor
browse if monitor>0 //Author's note: the third one tried to answer question 6 too... length 2775

*Comments that appear to number their answer to question 6 (space-delimited form).
*Any earlier relevant2 is discarded first so this section can be re-run.
capture drop relevant2
generate relevant2 = strpos(comment, " 6. ")
tabulate relevant2
browse if relevant2>0 //Author's note: 1 of length 2295

*Looser version of the same search: any occurrence of the two characters 6.
generate relevant3 = strpos(comment, "6.")
tabulate relevant3
browse if relevant3>0

*Comments that carry attachments, exported on their own; the full data are restored afterwards
preserve
	drop if attachmentfiles==""
	export excel using "$data/attachments.xlsx", firstrow(variables) replace
restore

*Comments without attachments that tripped at least one of the keyword flags
preserve
	* Shared condition: any of the four flags found a match
	local flagged "relevant>0 | relevant2>0 | relevant3>0 | monitor>0"
	keep if attachmentfiles==""
	browse comment relevant* monitor attachmentfiles if `flagged'
	keep if `flagged'
	export excel using "$data/relevant.xlsx", firstrow(variables) replace
restore

*Everything else: comments with no keyword flag and no attachment.
*useful is 1 when any flag matched or an attachment exists, 0 otherwise.
	generate useful = (relevant>0 | relevant2>0 | relevant3>0 | monitor>0 | attachmentfiles!="")
	browse comment relevant* monitor attachmentfiles if !useful
	* This keep is not wrapped in preserve/restore, so the data in memory are reduced from here on.
	keep if !useful
	export excel using "$data/other.xlsx", firstrow(variables) replace


